#!/usr/bin/env python

from __future__ import print_function
import argparse
import os
import sys

from lxml import html
from PIL import Image, ImageDraw


def get_prop(node, name):
    title = node.get('title')
    if not title:
        return None
    props = title.split(';')
    for prop in props:
        (key, args) = prop.split(None, 1)
        if key == name:
            return args.strip('"')
    return None


def get_bbox(node):
    bbox = get_prop(node, 'bbox')
    if not bbox:
        return None
    return tuple([int(x) for x in bbox.split()])


parser = argparse.ArgumentParser(
    description=('Cut a page (horizontally) into two pages in the middle '
                 'such that the most of the bounding boxes are separated '
                 'nicely, e.g. cutting double pages or double columns')
)
parser.add_argument('file', nargs='?', default=sys.stdin)
parser.add_argument('-d', '--debug', action="store_true")
args = parser.parse_args()

doc = html.parse(args.file)

pages = doc.xpath("//*[@class='ocr_page']")

for page in pages:

    try:
        filename = get_prop(page, 'image')
        filename = os.path.join(os.path.dirname(args.file), filename)
        image = Image.open(filename)
        debug_image = Image.open(filename)
        dr = ImageDraw.Draw(debug_image)
        image_found = True
    except IOError:
        print("Warning: Image not found!")
        args.debug = False
        image_found = False

    bbox = get_bbox(page)
    middle = bbox[2] / 2

    left_ends = []
    right_starts = []
    for line in doc.xpath("//*[@class='ocr_line']"):
        b = get_bbox(line)
        if (b[0] > middle):
            pos = "right"
        elif (b[2] < middle):
            pos = "left"
        elif (b[2] - middle > middle - b[1]):
            pos = "right"
        else:
            pos = "left"
        if (pos == "right"):
            right_starts.append(b[0])
            if (args.debug):
                dr.rectangle(b, fill=32)
        else:
            left_ends.append(b[2])
            if (args.debug):
                dr.rectangle(b, fill=96)

    left_ends.sort()
    right_starts.sort()
    n = len(left_ends)
    m = len(right_starts)
    middle_left = left_ends[n // 2]
    middle_right = right_starts[m // 2]

    middle = int((middle_left + middle_right) / 2)
    print("Cutting at", middle)

    if (image_found):

        if filename[-4] == ".":
            name = filename[:-3]
            suffix = filename[-3:]
        else:
            name = filename
            suffix = ""

        if (args.debug):
            dr.line(
                (middle_left, 0, middle_left, debug_image.size[1]),
                fill=64,
                width=3)
            dr.line(
                (middle_right, 0, middle_right, debug_image.size[1]),
                fill=64,
                width=3)
            dr.line(
                (middle, 0, middle, debug_image.size[1]), fill=128, width=5)
            debug_output = name + "cut." + suffix
            debug_image.save(debug_output)
            print("debug output is saved in", debug_output)

        left = image.crop((0, 0, middle, image.size[1]))
        left_name = name + "left." + suffix
        left.save(left_name)
        print("left page is saved in", left_name)
        right = image.crop((middle, 0, image.size[0], image.size[1]))
        right_name = name + "right." + suffix
        right.save(right_name)
        print("right page is saved in", right_name)
